## Load the scraped review data.  X1/X1_1 are unnamed index columns filled in
## by readr (see the preserved warnings below); X1 serves as the review ID
## throughout the rest of the script.
alldata <- read_csv('alldata.csv')
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X1_1 = col_double(),
##   Heading = col_character(),
##   Text = col_character(),
##   Rating = col_double(),
##   Time = col_datetime(format = ""),
##   Type = col_character()
## )
## Split the reviews by phone model; each subset is analyzed independently
## with the same pipeline.
iphone8 <- filter(alldata, Type == 'iphone8')
iphoneX <- filter(alldata, Type == 'iphoneX')
iphone11pm <- filter(alldata, Type == 'iphone11promax')

## iPhone 8 ----

## Tokenize the iPhone 8 review text into one word per row, then drop English
## stop words.  The join key is spelled out so the intent is explicit.
reviewsTidy <- iphone8 %>%
  select(X1, Text) %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word")

## raw word frequencies, most frequent first
wordCount <- reviewsTidy %>%
  count(word, sort = TRUE)

## Remove domain stop words and lemmatize the remaining tokens.
## NOTE: unnest_tokens() lower-cases tokens by default, so only lower-case
## entries can ever match; the old 'iPhone' entry was dead and is dropped.
commonWords <- c('iphone','phone','apple')

reviewsTidy <- reviewsTidy %>%
  mutate(lemma = lemmatize_words(word))

## frequencies of the lemmatized forms
wordCount <- reviewsTidy %>%
  count(lemma, sort = TRUE)

## remove infrequent words (fewer than freqLimit occurrences overall)
freqLimit <- 20
vocab <- wordCount %>%
  filter(n >= freqLimit)

## keep only frequent lemmas, then drop the domain stop words
reviewsTidy <- reviewsTidy %>%
  filter(lemma %in% vocab$lemma) %>%
  filter(!lemma %in% commonWords)


## remove very short reviews

reviewLength <- reviewsTidy %>%
  count(X1)

minLength <- 5

reviewLength <- reviewLength %>%
  filter(n >= minLength)

## create document term matrix for use in LDA

dtmUni <- reviewsTidy %>%
  filter(X1 %in% reviewLength$X1) %>%   # drop reviews shorter than minLength
  count(X1, lemma) %>%
  cast_dtm(X1, lemma, n)

## candidate topic counts; one Gibbs-sampled LDA is fit and cached per count
numTopics <- c(10, 20, 30, 40)

## make sure the cache directory exists before the first saveRDS()
dir.create('topicmodels', showWarnings = FALSE)

## iterate over the values directly instead of c(1:length(...)) indexing
for (k in numTopics) {
  theLDA <- LDA(dtmUni, k = k, method = "Gibbs",
                control = list(alpha = 1 / k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone8', k, '.rds'))
}
## Reload the cached 10-topic model for analysis.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone8',theNumTopics,'.rds'))

## per-topic word probabilities (beta)
theTopicsBeta <- tidy(theLDA, matrix = "beta")

## Top 15 words per topic; x gives every bar a globally unique position so
## facet_wrap(scales = 'free') can relabel each facet with its own terms.
## The duplicated ungroup() from the original has been removed.
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%   # keeps ties, so a topic may show more than 15 words
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1)  # for plotting

## Faceted bar chart of the top words per topic; the continuous x axis is
## relabeled with the actual terms via breaks/labels from TopicsTop.
plTopicWeights8 <- TopicsTop %>%
  mutate(topic=factor(topic)) %>%
  ggplot(aes(x=x,y=beta,fill=topic)) + 
  geom_bar(stat='identity',show.legend = F) + 
  coord_flip() + 
  facet_wrap(~topic,scales='free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0,0)) + 
  labs(title='Top Words by Topic',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 8 Reviews'),
       caption = 'Note: The words "iPhone", "Apple" and "Phone" and reviews less than 5 words long have been removed.',
       x = 'word',
       y = 'beta')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5),
        axis.text.y = element_text(size = 6))

plTopicWeights8

## Per-word topic assignments for every document in the DTM.
assignments <- augment(theLDA, data = dtmUni)

## Inspect one example review.
theDocID <- '23'
theDoc <- assignments %>%
  filter(document == theDocID)


## Map each retained lemma of the chosen review to its assigned topic.
tmp <- reviewsTidy %>% 
  filter(X1 == theDocID) %>%
  left_join(select(theDoc,term,.topic), by = c('lemma'='term')) %>%
  distinct()



## Re-tokenize the original text of the review; words removed during
## preprocessing get NA topic.  Join is on the raw word, so only words whose
## surface form equals a retained form pick up a topic.
theOrg <- iphone8 %>%
  filter(X1==theDocID) %>%
  select(X1,Text) %>%
  unnest_tokens(word,Text) %>%
  left_join(select(tmp,word,.topic), by = 'word') %>%
  mutate(wordID = row_number())

## Lay the review's words out on a 10-column x 100-row grid (reading order:
## left-to-right, top-to-bottom) and color each word by its assigned topic.
theBreaks <- c(1:10)
theY <- c(100:1)
dfIndex <- data.frame( y = rep(theY,each = length(theBreaks)),
                       x = rep(theBreaks, length(theY)) ) %>%
  mutate(wordID = row_number())


theOrg %>%
  left_join(dfIndex, by = 'wordID') %>%
  ggplot(aes(x=factor(x),y=y,label=word,color=factor(.topic))) + 
  geom_text() + 
  theme_bw() + 
  labs(x = '', y = '', title = paste0('ReviewID ',theDocID)) + 
  scale_color_discrete(name="Topic") + 
  theme(panel.grid.minor=element_blank(),
        panel.grid.major=element_blank(),
        axis.ticks = element_blank(), 
        axis.text = element_blank())

## @knitr ReviewClustering 

## Per-document topic mixtures (gamma), visualized for five sampled reviews.
theTopicsGamma <- tidy(theLDA, matrix = "gamma")

## NOTE(review): no set.seed() precedes this sample_n(), so the sampled
## reviews differ between runs.
theSampleReviews <- reviewLength %>%
  sample_n(5)

theTopicsGamma %>%
  filter(document %in% theSampleReviews$X1) %>%
  ggplot(aes(x=topic,y=gamma,fill=document)) + 
  geom_bar(stat='identity') + 
  facet_wrap(~document,ncol = 1) + 
  theme(legend.position = 'none') + 
  scale_y_continuous(labels = percent) + 
  labs(title = '5 Random Reviews',
       y = 'Topic Weight (Gamma)')

## @knitr TopicEvolution

## Join per-document topic weights back onto review metadata.  The gamma
## table's 'document' column is character, so X1 is mirrored as a character ID.
iphone8 <- iphone8 %>%
  mutate(ID = as.character(X1))

theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
  inner_join(iphone8,by=c('document'='ID'))

## Average topic weight per calendar year of the review.
theTopicsGamma %>%
  group_by(topic, Year = year(Time)) %>%
  summarize(mean = mean(gamma)) %>%
  ggplot(aes(x=Year,y=mean,group=topic)) + geom_line() + 
  facet_wrap(~topic,labeller = label_both) + 
  scale_y_continuous(labels = percent) + 
  labs(title = 'Topic Evolution', x = 'Year of Review', y = 'Average Topic Weight') + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## @knitr TopicSentiments

## Same metadata join as above; average topic weight by star rating.
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
  inner_join(iphone8,by=c('document'='ID'))

theTopicsGamma %>%
  group_by(Rating,topic) %>%
  summarize(mean = mean(gamma)) %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x=Rating,y=mean,fill=topic)) + 
  geom_bar(stat='identity') + 
  facet_wrap(~topic, scales = 'free', labeller = label_both) + 
  scale_y_continuous(labels = percent) + 
  theme(legend.position = 'none') + 
  labs(title = 'Topic Weights by Star Rating', x = 'Rating', y = 'Average Topic Weight')  

## @knitr LDAAriaUniandBigrams

## Number words within each review; positions are used below to splice
## frequent bigrams in place of their component unigrams.
reviewsTidyUni <- reviewsTidy %>%
  group_by(X1) %>%
  mutate(wordNumber = row_number()) %>%
  ungroup()

plTopicWeights8

## all reviews
## BUG FIX: lead() is now computed within each review (group_by(X1)).
## Previously it ran over the ungrouped table, so the last word of one review
## was paired with the first word of the next, creating spurious
## cross-document bigrams.
tmpUni <- reviewsTidyUni %>%
  rename(lemma1 = lemma) %>%
  group_by(X1) %>%
  mutate(lemma2 = lead(lemma1),
         Index1 = wordNumber,
         Index2 = lead(wordNumber),
         bilemma = paste0(lemma1, '_', lemma2)) %>%
  ungroup()

BiLimit <- 100

## Frequent bigrams.  The trailing word of each review has lemma2 == NA and
## is excluded so "word_NA" never becomes a token.
freqBi <- tmpUni %>%
  filter(!is.na(lemma2)) %>%
  count(bilemma, sort = TRUE) %>%
  filter(n >= BiLimit)

## all occurrences of the frequent bigrams
newBi <- tmpUni %>%
  filter(bilemma %in% freqBi$bilemma)

## positions of both component words of every matched bigram, to be removed
## from the unigram stream
tmpRemoveRows <- newBi %>%
  select(Index1, Index2, bilemma, X1) %>%
  gather(Index, wordNumber, -bilemma, -X1) %>%
  select(X1, wordNumber)

newBi <- newBi %>%
  select(X1, bilemma) %>%
  rename(lemma1 = bilemma)

## unigrams (minus those absorbed into bigrams) plus the bigram tokens
reviewsTidyUniBi <- tmpUni %>%
  anti_join(tmpRemoveRows, by = c('X1', 'wordNumber')) %>%
  select(X1, lemma1) %>%
  bind_rows(newBi)

## Vocabulary of frequent uni+bi tokens (at least 20 occurrences overall).
vocab <- reviewsTidyUniBi %>%
  count(lemma1, sort = TRUE) %>%
  filter(n >= 20)

reviewsTidyUniBi <- reviewsTidyUniBi %>%
  filter(lemma1 %in% vocab$lemma1)


## remove very short reviews
minLength <- 5

reviewLength <- reviewsTidyUniBi %>%
  count(X1) %>%
  filter(n >= minLength)

## create document term matrix for use in LDA

dtmBi <- reviewsTidyUniBi %>%
  filter(X1 %in% reviewLength$X1) %>%   # drop reviews shorter than minLength
  count(X1, lemma1) %>%
  cast_dtm(X1, lemma1, n)

## candidate topic counts for the unigram+bigram models
numTopics <- c(10, 20, 30, 40)

## make sure the cache directory exists before the first saveRDS()
dir.create('topicmodels', showWarnings = FALSE)

## iterate over the values directly instead of c(1:length(...)) indexing
for (k in numTopics) {
  theLDA <- LDA(dtmBi, k = k, method = "Gibbs",
                control = list(alpha = 1 / k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone8_Bi', k, '.rds'))
}

## @knitr AnalyzeTopicsUniBi

## Reload the cached 10-topic unigram+bigram model.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone8_Bi',theNumTopics,'.rds'))

theTopicsBeta <- tidy(theLDA, matrix = "beta")

## Top 15 tokens (unigrams or bigrams) per topic; the duplicated ungroup()
## from the original has been removed.
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%   # keeps ties
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1)  # for plotting

## Faceted bar chart, mirroring the unigram-only plot above.
plTopicWeights8_2 <- TopicsTop %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x = x, y = beta, fill = topic)) +
  geom_bar(stat = 'identity', show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~topic, scales = 'free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0, 0)) +
  labs(title = 'Topic Model with both Unigrams and Bigrams',
       subtitle = paste0(theNumTopics, ' Topic LDA of ',
                         prettyNum(nrow(reviewLength), big.mark = ",", scientific = FALSE), ' iPhone 8 Reviews'),
       x = 'word',
       y = 'beta') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),
        axis.text.y = element_text(size = 6))
plTopicWeights8_2

## Prepare LDAvis inputs.  phi rows are topics over terms; theta rows are
## documents over topics.  theta rows and doc.length must be in the SAME
## document order.  BUG FIX: spread() orders the document labels as character
## strings ("1","10","100",...), while reviewLength is ordered numerically by
## X1, so the spread result is re-sorted numerically before dropping the label.
theTopicsBetaW <- tidy(theLDA, matrix = "beta") %>%
  spread(term, beta) %>%
  select(-topic)
theTopicsGammaW <- tidy(theLDA, matrix = "gamma") %>%
  spread(topic, gamma) %>%
  arrange(as.numeric(document)) %>%
  select(-document)
theTerms <- colnames(theTopicsBetaW)

## Term frequencies aligned to theTerms.  Restricting to terms present in the
## model guards against vocab entries that only occurred in dropped short
## reviews, which would make term.frequency longer than vocab.
theVocab <- vocab %>%
  filter(lemma1 %in% theTerms) %>%
  mutate(word = factor(lemma1, levels = theTerms)) %>%
  arrange(word) %>%
  mutate(word = as.character(word))

json <- createJSON(
  phi = theTopicsBetaW,
  theta = theTopicsGammaW,
  doc.length = reviewLength$n,
  vocab = theTerms,
  R = theNumTopics,   # NOTE(review): R is the number of terms shown per topic in LDAvis; reusing the topic count here looks accidental — confirm intent
  term.frequency = theVocab$n
)

serVis(json)
## Loading required namespace: servr

## iPhone X ----

## iPhone X preprocessing — same pipeline as the iPhone 8 section: tokenize,
## remove stop words, lemmatize, keep frequent lemmas, drop short reviews,
## and build the document-term matrix.
reviewsTidy <- iphoneX %>%
  select(X1,Text) %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words)
## Joining, by = "word"
## get raw word frequencies  

wordCount <- reviewsTidy %>%
  count(word,sort = TRUE)

## remove common words and lemmatize remaining
## NOTE(review): unnest_tokens() lower-cases tokens, so the 'iPhone' entry
## never matches and is redundant with 'iphone'.
commonWords <- c('iphone','phone','apple','iPhone')

reviewsTidy <- reviewsTidy %>%
  mutate(lemma = lemmatize_words(word))

wordCount <- reviewsTidy %>%
  count(lemma,sort = TRUE)

## remove infrequent words 
freqLimit <- 20
vocab <- wordCount %>%
  filter(n >= freqLimit)

reviewsTidy <- reviewsTidy %>%
  filter(lemma %in% vocab$lemma) %>%
  filter(!lemma %in% commonWords)


## remove very short reviews

reviewLength <- reviewsTidy %>%
  count(X1)

minLength <- 5

reviewLength <- reviewLength %>%
  filter(n >= minLength)

## create document term matrix for use in LDA 

dtmUni <- reviewsTidy %>%
  filter(X1 %in% reviewLength$X1) %>%
  count(X1,lemma) %>%
  cast_dtm(X1, lemma, n)
## candidate topic counts; one Gibbs-sampled LDA is fit and cached per count
numTopics <- c(10, 20, 30, 40)

## make sure the cache directory exists before the first saveRDS()
dir.create('topicmodels', showWarnings = FALSE)

## iterate over the values directly instead of c(1:length(...)) indexing
for (k in numTopics) {
  theLDA <- LDA(dtmUni, k = k, method = "Gibbs",
                control = list(alpha = 1 / k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhoneX', k, '.rds'))
}
## Reload the cached 10-topic model for analysis.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhoneX',theNumTopics,'.rds'))

## per-topic word probabilities (beta)
theTopicsBeta <- tidy(theLDA, matrix = "beta")

## Top 15 words per topic with a globally unique bar position x; the
## duplicated ungroup() from the original has been removed.
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%   # keeps ties
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1)  # for plotting

## Top-words plot and single-review inspection — mirrors the iPhone 8 section.
plTopicWeightsX <- TopicsTop %>%
  mutate(topic=factor(topic)) %>%
  ggplot(aes(x=x,y=beta,fill=topic)) + 
  geom_bar(stat='identity',show.legend = F) + 
  coord_flip() + 
  facet_wrap(~topic,scales='free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0,0)) + 
  labs(title='Top Words by Topic',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone X Reviews'),
       caption = 'Note: The words "iPhone", "Apple" and "Phone" and reviews less than 5 words long have been removed.',
       x = 'word',
       y = 'beta')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5),
        axis.text.y = element_text(size = 6))

plTopicWeightsX

## per-word topic assignments for every document in the DTM
assignments <- augment(theLDA, data = dtmUni)

theDocID <- '23'
theDoc <- assignments %>%
  filter(document == theDocID)


## retained lemmas of the chosen review mapped to their assigned topics
tmp <- reviewsTidy %>% 
  filter(X1 == theDocID) %>%
  left_join(select(theDoc,term,.topic), by = c('lemma'='term')) %>%
  distinct()



## re-tokenized original text; words removed in preprocessing get NA topic
theOrg <- iphoneX %>%
  filter(X1==theDocID) %>%
  select(X1,Text) %>%
  unnest_tokens(word,Text) %>%
  left_join(select(tmp,word,.topic), by = 'word') %>%
  mutate(wordID = row_number())

## 10-column x 100-row layout grid for plotting the review's words in order
theBreaks <- c(1:10)
theY <- c(100:1)
dfIndex <- data.frame( y = rep(theY,each = length(theBreaks)),
                       x = rep(theBreaks, length(theY)) ) %>%
  mutate(wordID = row_number())


theOrg %>%
  left_join(dfIndex, by = 'wordID') %>%
  ggplot(aes(x=factor(x),y=y,label=word,color=factor(.topic))) + 
  geom_text() + 
  theme_bw() + 
  labs(x = '', y = '', title = paste0('ReviewID ',theDocID)) + 
  scale_color_discrete(name="Topic") + 
  theme(panel.grid.minor=element_blank(),
        panel.grid.major=element_blank(),
        axis.ticks = element_blank(), 
        axis.text = element_blank())

## @knitr ReviewClustering 

## Topic mixtures for five sampled reviews — mirrors the iPhone 8 section.
theTopicsGamma <- tidy(theLDA, matrix = "gamma")

## NOTE(review): sample_n() is unseeded, so the sampled reviews vary by run.
theSampleReviews <- reviewLength %>%
  sample_n(5)

theTopicsGamma %>%
  filter(document %in% theSampleReviews$X1) %>%
  ggplot(aes(x=topic,y=gamma,fill=document)) + 
  geom_bar(stat='identity') + 
  facet_wrap(~document,ncol = 1) + 
  theme(legend.position = 'none') + 
  scale_y_continuous(labels = percent) + 
  labs(title = '5 Random Reviews',
       y = 'Topic Weight (Gamma)')

## @knitr TopicEvolution

## character ID mirroring X1, for joining against gamma's character 'document'
iphoneX <- iphoneX %>%
  mutate(ID = as.character(X1))

theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
  inner_join(iphoneX,by=c('document'='ID'))

## average topic weight per calendar year
theTopicsGamma %>%
  group_by(topic, Year = year(Time)) %>%
  summarize(mean = mean(gamma)) %>%
  ggplot(aes(x=Year,y=mean,group=topic)) + geom_line() + 
  facet_wrap(~topic,labeller = label_both) + 
  scale_y_continuous(labels = percent) + 
  labs(title = 'Topic Evolution', x = 'Year of Review', y = 'Average Topic Weight') + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## @knitr TopicSentiments

## average topic weight by star rating
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
  inner_join(iphoneX,by=c('document'='ID'))

theTopicsGamma %>%
  group_by(Rating,topic) %>%
  summarize(mean = mean(gamma)) %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x=Rating,y=mean,fill=topic)) + 
  geom_bar(stat='identity') + 
  facet_wrap(~topic, scales = 'free', labeller = label_both) + 
  scale_y_continuous(labels = percent) + 
  theme(legend.position = 'none') + 
  labs(title = 'Topic Weights by Star Rating', x = 'Rating', y = 'Average Topic Weight')  

## @knitr LDAAriaUniandBigrams

## Number words within each review for the bigram splicing below.
reviewsTidyUni <- reviewsTidy %>%
  group_by(X1) %>%
  mutate(wordNumber = row_number()) %>%
  ungroup()

plTopicWeightsX

## all reviews
## BUG FIX: lead() is now computed within each review (group_by(X1)).
## Previously it paired the last word of one review with the first word of
## the next, creating spurious cross-document bigrams.
tmpUni <- reviewsTidyUni %>%
  rename(lemma1 = lemma) %>%
  group_by(X1) %>%
  mutate(lemma2 = lead(lemma1),
         Index1 = wordNumber,
         Index2 = lead(wordNumber),
         bilemma = paste0(lemma1, '_', lemma2)) %>%
  ungroup()

BiLimit <- 100

## frequent bigrams; trailing NA pairs are excluded so "word_NA" never counts
freqBi <- tmpUni %>%
  filter(!is.na(lemma2)) %>%
  count(bilemma, sort = TRUE) %>%
  filter(n >= BiLimit)

## all occurrences of the frequent bigrams
newBi <- tmpUni %>%
  filter(bilemma %in% freqBi$bilemma)

## positions of both component words of every matched bigram
tmpRemoveRows <- newBi %>%
  select(Index1, Index2, bilemma, X1) %>%
  gather(Index, wordNumber, -bilemma, -X1) %>%
  select(X1, wordNumber)

newBi <- newBi %>%
  select(X1, bilemma) %>%
  rename(lemma1 = bilemma)

## unigrams (minus those absorbed into bigrams) plus the bigram tokens
reviewsTidyUniBi <- tmpUni %>%
  anti_join(tmpRemoveRows, by = c('X1', 'wordNumber')) %>%
  select(X1, lemma1) %>%
  bind_rows(newBi)

## Vocabulary of frequent uni+bi tokens (at least 20 occurrences overall).
vocab <- reviewsTidyUniBi %>%
  count(lemma1, sort = TRUE) %>%
  filter(n >= 20)

reviewsTidyUniBi <- reviewsTidyUniBi %>%
  filter(lemma1 %in% vocab$lemma1)


## remove very short reviews
minLength <- 5

reviewLength <- reviewsTidyUniBi %>%
  count(X1) %>%
  filter(n >= minLength)

## create document term matrix for use in LDA

dtmBi <- reviewsTidyUniBi %>%
  filter(X1 %in% reviewLength$X1) %>%   # drop reviews shorter than minLength
  count(X1, lemma1) %>%
  cast_dtm(X1, lemma1, n)

## candidate topic counts for the unigram+bigram models
numTopics <- c(10, 20, 30, 40)

## make sure the cache directory exists before the first saveRDS()
dir.create('topicmodels', showWarnings = FALSE)

## iterate over the values directly instead of c(1:length(...)) indexing
for (k in numTopics) {
  theLDA <- LDA(dtmBi, k = k, method = "Gibbs",
                control = list(alpha = 1 / k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhoneX_Bi', k, '.rds'))
}

## @knitr AnalyzeTopicsUniBi

## Reload the cached 10-topic unigram+bigram model.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhoneX_Bi',theNumTopics,'.rds'))

theTopicsBeta <- tidy(theLDA, matrix = "beta")

## Top 15 tokens per topic; the duplicated ungroup() has been removed.
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%   # keeps ties
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1)  # for plotting

## Faceted top-token plot for the unigram+bigram model.
plTopicWeightsX_2 <- TopicsTop %>%
  mutate(topic=factor(topic)) %>%
  ggplot(aes(x=x,y=beta,fill=topic)) + 
  geom_bar(stat='identity',show.legend = F) + 
  coord_flip() + 
  facet_wrap(~topic,scales='free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0,0)) + 
  labs(title='Topic Model with both Unigrams and Bigrams',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone X Reviews'),
       x = 'word',
       y = 'beta')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5),
        axis.text.y = element_text(size = 6))
plTopicWeightsX_2

## Prepare LDAvis inputs.  BUG FIX: spread() orders document labels as
## character strings ("1","10","100",...), while doc.length (reviewLength) is
## ordered numerically by X1; theta is re-sorted numerically so the two align.
theTopicsBetaW <- tidy(theLDA, matrix = "beta") %>%
  spread(term, beta) %>%
  select(-topic)
theTopicsGammaW <- tidy(theLDA, matrix = "gamma") %>%
  spread(topic, gamma) %>%
  arrange(as.numeric(document)) %>%
  select(-document)
theTerms <- colnames(theTopicsBetaW)

## term frequencies aligned to theTerms; restrict to terms present in the
## model so term.frequency has the same length as vocab
theVocab <- vocab %>%
  filter(lemma1 %in% theTerms) %>%
  mutate(word = factor(lemma1, levels = theTerms)) %>%
  arrange(word) %>%
  mutate(word = as.character(word))

json <- createJSON(
  phi = theTopicsBetaW,
  theta = theTopicsGammaW,
  doc.length = reviewLength$n,
  vocab = theTerms,
  R = theNumTopics,   # NOTE(review): R is the number of terms LDAvis shows per topic — confirm reusing the topic count is intended
  term.frequency = theVocab$n
)

serVis(json)

## iPhone 11 Pro Max ----

## iPhone 11 Pro Max preprocessing — same pipeline as the iPhone 8 section:
## tokenize, remove stop words, lemmatize, keep frequent lemmas, drop short
## reviews, and build the document-term matrix.
reviewsTidy <- iphone11pm %>%
  select(X1,Text) %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words)
## Joining, by = "word"
## get raw word frequencies  

wordCount <- reviewsTidy %>%
  count(word,sort = TRUE)

## remove common words and lemmatize remaining
## NOTE(review): unnest_tokens() lower-cases tokens, so the 'iPhone' entry
## never matches and is redundant with 'iphone'.
commonWords <- c('iphone','phone','apple','iPhone')

reviewsTidy <- reviewsTidy %>%
  mutate(lemma = lemmatize_words(word))

wordCount <- reviewsTidy %>%
  count(lemma,sort = TRUE)

## remove infrequent words 
freqLimit <- 20
vocab <- wordCount %>%
  filter(n >= freqLimit)

reviewsTidy <- reviewsTidy %>%
  filter(lemma %in% vocab$lemma) %>%
  filter(!lemma %in% commonWords)


## remove very short reviews

reviewLength <- reviewsTidy %>%
  count(X1)

minLength <- 5

reviewLength <- reviewLength %>%
  filter(n >= minLength)

## create document term matrix for use in LDA 

dtmUni <- reviewsTidy %>%
  filter(X1 %in% reviewLength$X1) %>%
  count(X1,lemma) %>%
  cast_dtm(X1, lemma, n)
## candidate topic counts; one Gibbs-sampled LDA is fit and cached per count
numTopics <- c(10, 20, 30, 40)

## make sure the cache directory exists before the first saveRDS()
dir.create('topicmodels', showWarnings = FALSE)

## iterate over the values directly instead of c(1:length(...)) indexing
for (k in numTopics) {
  theLDA <- LDA(dtmUni, k = k, method = "Gibbs",
                control = list(alpha = 1 / k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone11pm', k, '.rds'))
}
## Reload the cached 10-topic model for analysis.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone11pm',theNumTopics,'.rds'))

## per-topic word probabilities (beta)
theTopicsBeta <- tidy(theLDA, matrix = "beta")

## Top 15 words per topic with a globally unique bar position x; the
## duplicated ungroup() from the original has been removed.
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%   # keeps ties
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1)  # for plotting

## Top-words plot and single-review inspection — mirrors the iPhone 8 section.
plTopicWeights11 <- TopicsTop %>%
  mutate(topic=factor(topic)) %>%
  ggplot(aes(x=x,y=beta,fill=topic)) + 
  geom_bar(stat='identity',show.legend = F) + 
  coord_flip() + 
  facet_wrap(~topic,scales='free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0,0)) + 
  labs(title='Top Words by Topic',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 11 Pro Max Reviews'),
       caption = 'Note: The words "iPhone", "Apple" and "Phone" and reviews less than 5 words long have been removed.',
       x = 'word',
       y = 'beta')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5),
        axis.text.y = element_text(size = 6))

plTopicWeights11

## per-word topic assignments for every document in the DTM
assignments <- augment(theLDA, data = dtmUni)

theDocID <- '2519'
theDoc <- assignments %>%
  filter(document == theDocID)


## retained lemmas of the chosen review mapped to their assigned topics
tmp <- reviewsTidy %>% 
  filter(X1 == theDocID) %>%
  left_join(select(theDoc,term,.topic), by = c('lemma'='term')) %>%
  distinct()



## re-tokenized original text; words removed in preprocessing get NA topic
theOrg <- iphone11pm %>%
  filter(X1==theDocID) %>%
  select(X1,Text) %>%
  unnest_tokens(word,Text) %>%
  left_join(select(tmp,word,.topic), by = 'word') %>%
  mutate(wordID = row_number())

## 10-column x 100-row layout grid for plotting the review's words in order
theBreaks <- c(1:10)
theY <- c(100:1)
dfIndex <- data.frame( y = rep(theY,each = length(theBreaks)),
                       x = rep(theBreaks, length(theY)) ) %>%
  mutate(wordID = row_number())


theOrg %>%
  left_join(dfIndex, by = 'wordID') %>%
  ggplot(aes(x=factor(x),y=y,label=word,color=factor(.topic))) + 
  geom_text() + 
  theme_bw() + 
  labs(x = '', y = '', title = paste0('ReviewID ',theDocID)) + 
  scale_color_discrete(name="Topic") + 
  theme(panel.grid.minor=element_blank(),
        panel.grid.major=element_blank(),
        axis.ticks = element_blank(), 
        axis.text = element_blank())

## @knitr ReviewClustering 

## Topic mixtures for five sampled reviews — mirrors the iPhone 8 section.
theTopicsGamma <- tidy(theLDA, matrix = "gamma")

## NOTE(review): sample_n() is unseeded, so the sampled reviews vary by run.
theSampleReviews <- reviewLength %>%
  sample_n(5)

theTopicsGamma %>%
  filter(document %in% theSampleReviews$X1) %>%
  ggplot(aes(x=topic,y=gamma,fill=document)) + 
  geom_bar(stat='identity') + 
  facet_wrap(~document,ncol = 1) + 
  theme(legend.position = 'none') + 
  scale_y_continuous(labels = percent) + 
  labs(title = '5 Random Reviews',
       y = 'Topic Weight (Gamma)')

## @knitr TopicEvolution

## character ID mirroring X1, for joining against gamma's character 'document'
iphone11pm <- iphone11pm %>%
  mutate(ID = as.character(X1))

theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
  inner_join(iphone11pm,by=c('document'='ID'))

## average topic weight per calendar year
theTopicsGamma %>%
  group_by(topic, Year = year(Time)) %>%
  summarize(mean = mean(gamma)) %>%
  ggplot(aes(x=Year,y=mean,group=topic)) + geom_line() + 
  facet_wrap(~topic,labeller = label_both) + 
  scale_y_continuous(labels = percent) + 
  labs(title = 'Topic Evolution', x = 'Year of Review', y = 'Average Topic Weight') + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## @knitr TopicSentiments

## average topic weight by star rating
theTopicsGamma <- tidy(theLDA, matrix = "gamma") %>%
  inner_join(iphone11pm,by=c('document'='ID'))

theTopicsGamma %>%
  group_by(Rating,topic) %>%
  summarize(mean = mean(gamma)) %>%
  mutate(topic = factor(topic)) %>%
  ggplot(aes(x=Rating,y=mean,fill=topic)) + 
  geom_bar(stat='identity') + 
  facet_wrap(~topic, scales = 'free', labeller = label_both) + 
  scale_y_continuous(labels = percent) + 
  theme(legend.position = 'none') + 
  labs(title = 'Topic Weights by Star Rating', x = 'Rating', y = 'Average Topic Weight')  

## @knitr LDAAriaUniandBigrams

## Number words within each review for the bigram splicing below.
reviewsTidyUni <- reviewsTidy %>%
  group_by(X1) %>%
  mutate(wordNumber = row_number()) %>%
  ungroup()

plTopicWeights11

## all reviews
## BUG FIX: lead() is now computed within each review (group_by(X1)).
## Previously it paired the last word of one review with the first word of
## the next, creating spurious cross-document bigrams.
tmpUni <- reviewsTidyUni %>%
  rename(lemma1 = lemma) %>%
  group_by(X1) %>%
  mutate(lemma2 = lead(lemma1),
         Index1 = wordNumber,
         Index2 = lead(wordNumber),
         bilemma = paste0(lemma1, '_', lemma2)) %>%
  ungroup()

BiLimit <- 100

## frequent bigrams; trailing NA pairs are excluded so "word_NA" never counts
freqBi <- tmpUni %>%
  filter(!is.na(lemma2)) %>%
  count(bilemma, sort = TRUE) %>%
  filter(n >= BiLimit)

## all occurrences of the frequent bigrams
newBi <- tmpUni %>%
  filter(bilemma %in% freqBi$bilemma)

## positions of both component words of every matched bigram
tmpRemoveRows <- newBi %>%
  select(Index1, Index2, bilemma, X1) %>%
  gather(Index, wordNumber, -bilemma, -X1) %>%
  select(X1, wordNumber)

newBi <- newBi %>%
  select(X1, bilemma) %>%
  rename(lemma1 = bilemma)

## unigrams (minus those absorbed into bigrams) plus the bigram tokens
reviewsTidyUniBi <- tmpUni %>%
  anti_join(tmpRemoveRows, by = c('X1', 'wordNumber')) %>%
  select(X1, lemma1) %>%
  bind_rows(newBi)

## Vocabulary of frequent uni+bi tokens (at least 20 occurrences overall).
vocab <- reviewsTidyUniBi %>%
  count(lemma1, sort = TRUE) %>%
  filter(n >= 20)

reviewsTidyUniBi <- reviewsTidyUniBi %>%
  filter(lemma1 %in% vocab$lemma1)


## remove very short reviews
minLength <- 5

reviewLength <- reviewsTidyUniBi %>%
  count(X1) %>%
  filter(n >= minLength)

## create document term matrix for use in LDA

dtmBi <- reviewsTidyUniBi %>%
  filter(X1 %in% reviewLength$X1) %>%   # drop reviews shorter than minLength
  count(X1, lemma1) %>%
  cast_dtm(X1, lemma1, n)

## candidate topic counts for the unigram+bigram models
numTopics <- c(10, 20, 30, 40)

## make sure the cache directory exists before the first saveRDS()
dir.create('topicmodels', showWarnings = FALSE)

## iterate over the values directly instead of c(1:length(...)) indexing
for (k in numTopics) {
  theLDA <- LDA(dtmBi, k = k, method = "Gibbs",
                control = list(alpha = 1 / k, iter = 5000, burnin = 10000, seed = 1234))
  saveRDS(theLDA, file = paste0('topicmodels/ldaiPhone11pm_Bi', k, '.rds'))
}

## @knitr AnalyzeTopicsUniBi

## Reload the cached 10-topic unigram+bigram model.
theNumTopics <- 10
theLDA <- read_rds(paste0('topicmodels/ldaiPhone11pm_Bi',theNumTopics,'.rds'))

theTopicsBeta <- tidy(theLDA, matrix = "beta")

## Top 15 tokens per topic; the duplicated ungroup() has been removed.
TopicsTop <- theTopicsBeta %>%
  group_by(topic) %>%
  top_n(15, beta) %>%   # keeps ties
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(x = n():1)  # for plotting

## Faceted top-token plot for the unigram+bigram model.
plTopicWeights11_2 <- TopicsTop %>%
  mutate(topic=factor(topic)) %>%
  ggplot(aes(x=x,y=beta,fill=topic)) + 
  geom_bar(stat='identity',show.legend = F) + 
  coord_flip() + 
  facet_wrap(~topic,scales='free') +
  scale_x_continuous(breaks = TopicsTop$x,
                     labels = TopicsTop$term,
                     expand = c(0,0)) + 
  labs(title='Topic Model with both Unigrams and Bigrams',
       subtitle = paste0(theNumTopics,' Topic LDA of ',
                         prettyNum(nrow(reviewLength),big.mark=",",scientific=FALSE), ' iPhone 11 Pro Max Reviews'),
       x = 'word',
       y = 'beta')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5),
        axis.text.y = element_text(size = 6))
plTopicWeights11_2

## Prepare LDAvis inputs.  BUG FIX: spread() orders document labels as
## character strings ("1","10","100",...), while doc.length (reviewLength) is
## ordered numerically by X1; theta is re-sorted numerically so the two align.
theTopicsBetaW <- tidy(theLDA, matrix = "beta") %>%
  spread(term, beta) %>%
  select(-topic)
theTopicsGammaW <- tidy(theLDA, matrix = "gamma") %>%
  spread(topic, gamma) %>%
  arrange(as.numeric(document)) %>%
  select(-document)
theTerms <- colnames(theTopicsBetaW)

## term frequencies aligned to theTerms; restrict to terms present in the
## model so term.frequency has the same length as vocab
theVocab <- vocab %>%
  filter(lemma1 %in% theTerms) %>%
  mutate(word = factor(lemma1, levels = theTerms)) %>%
  arrange(word) %>%
  mutate(word = as.character(word))

json <- createJSON(
  phi = theTopicsBetaW,
  theta = theTopicsGammaW,
  doc.length = reviewLength$n,
  vocab = theTerms,
  R = theNumTopics,   # NOTE(review): R is the number of terms LDAvis shows per topic — confirm reusing the topic count is intended
  term.frequency = theVocab$n
)

serVis(json)